import pandas as pd
import numpy as np
import cufflinks as cf
import seaborn as sns
import matplotlib.pyplot as plt
cf.go_offline()
url = r'/Users/Divir/Desktop/Geeklife/medical-malpractice-ml/NPDB1804.csv'
df = pd.read_csv(open(url, 'rb'), low_memory=False)
df.head()
# 1. Payment records only - RECTYPE = P
# 2. Non-state payments - FUNDPYMT = 0
df = df[(df['RECTYPE'] == 'P') & (df['FUNDPYMT'] == 0)]
df['PAYMENT'] = df['PAYMENT'].str.replace('$', '').astype(float)
ins_payments = df.loc[df['REPTYPE'] == 101, 'PAYMENT']
non_ins_payments = df.loc[df['REPTYPE'] == 102, 'PAYMENT']
fig, ax = plt.subplots(1, 2)
sns.distplot(ins_payments, ax=ax[0], hist_kws={'log': True}, kde=False, color='skyblue')
ax[0].set_title('Insurance Company Payments')
sns.distplot(non_ins_payments, ax=ax[1], hist_kws={'log': True}, kde=False, color='green')
ax[1].set_title('Non-Insurance Company Payments');
There doesn't seem to be much of a difference between insurance and non-insurance report payments. Both seem to have some big outliers (note that the axis is logarithmic).
pmt_by_year = df.groupby('ORIGYEAR')['PAYMENT'].sum()
pmt_by_year.iplot(kind='bar', title='Payments by Year', xTitle='Year', yTitle='Total payments')
Total payments don't seem to have changed much year over year either.
regions = {
'West': ['CA', 'OR', 'UT','WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID'],
'South West': ['AZ', 'TX', 'NM', 'OK'],
'South East': ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN' ],
'Mid West': ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND'],
'North East': ['CT', 'NY', 'PA', 'NJ', 'RI','MA', 'MD', 'VT', 'NH', 'ME'],
}
state_to_region = {state: region for region, states in regions.items() for state in states}
df['REGION'] = df['WORKSTAT'].map(state_to_region)
pmt_by_region = df.groupby('REGION')[['PAYMENT']].mean().reset_index()
pmt_by_region.iplot(kind='pie', title='Payment by region', labels='REGION', values='PAYMENT')
The North East and Mid West tend to account for about half of all the payments. This could be because of stricter laws in these states, but we would have to look these us further.
labels = ['XS', 'SM', 'MD', 'LG', 'XL']
df['PMTGROUP'] = pd.qcut(df['PAYMENT'], q=5, labels=labels)
pmt_group_by_region = df.groupby('REGION')['PMTGROUP'].value_counts(normalize=True).unstack()
pmt_group_by_region = pmt_group_by_region[labels]
pmt_group_by_region.iplot(kind='bar', barmode='stack')
The North East, which is the largest piece of the pie above, tends to have very large payments compared to other regions. Again, this could be due to regulations in these states.
age_map = {
-1: 'fetus',
0: 'baby',
70: 'elder',
80: 'elder',
90: 'elder'
}
df['PTAGEGROUP'] = df['PTAGE'].map(age_map)
groups = df.groupby('REGION')['PTAGEGROUP'].value_counts(normalize=True)
age_stats = groups.unstack()[['fetus', 'baby', 'elder']]
age_stats.iplot(kind='bar', title='Patient age by region')
We can see that the Mid West has about twice as many reports of fetuses as the West. We might expect payments to be higher for reports with fetuses, which we can explore next.
groups = df.groupby('PTAGEGROUP')['PAYMENT']
pmt_by_age_group = groups.describe().unstack()
pmt_by_age_group = pmt_by_age_group.loc[['fetus', 'baby', 'elder'], ['mean', 'std']]
pmt_by_age_group.T.iplot(kind='bar', )
Indeed payments for fetuses and babies tend to be much higher and with more variation.